import numpy as np
import pandas as pd
import json
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
1.A. Read ‘Car name.csv’ as a DataFrame and assign it to a variable
# 1.A: load the car-name data.
# NOTE(review): absolute Windows path — breaks on any other machine; prefer a relative path.
d1= pd.read_csv("C:\\Users\\HARITHA\\Car name.csv")
d1.head()
1.B. Read ‘Car-Attributes.json’ as a DataFrame and assign it to a variable.
# 1.B: load the car attributes (assumed record-oriented JSON in the working directory).
d2 = pd.read_json("Car-Attributes.json")
d2.head()
1.C. Merge both the DataFrames together to form a single DataFrame
# 1.C: side-by-side join on the row index.
# NOTE(review): assumes both files list the cars in the same order — TODO confirm.
df=d1.join(d2)
df.head()
df.shape
1.D. Print 5 point summary of the numerical features and share insights.
# 1.D: five-point summary (count/mean/std/min/quartiles/max), transposed for readability.
df.describe().T
# Horizontal boxplots of all numeric features on one axis for a quick spread/outlier view.
sns.boxplot(df,orient='h');
2.A. Check and print feature-wise percentage of missing values present in the data and impute with the best suitable approach.
# 2.A: feature-wise percentage of missing values.
# Mean of the boolean null-mask is the missing fraction; scale to percent.
percent_missing = df.isnull().mean() * 100
percent_missing
# Raw per-column missing counts, for reference.
df.isnull().sum()
2.B. Check for duplicate values in the data and impute with the best suitable approach
df.duplicated().sum()
2.C. Plot a pairplot for all features.
sns.pairplot(df,diag_kind='kde');
2.D. Visualize a scatterplot for ‘wt’ and ‘disp’. Datapoints should be distinguishable by ‘cyl’.
sns.scatterplot(data=df,x='wt',y='disp',hue='cyl',palette='dark');
2.E. Share insights for Q2.d
2.F. Visualize a scatterplot for ‘wt’ and ’mpg’. Datapoints should be distinguishable by ‘cyl’
sns.scatterplot(data=df,x='wt',y='mpg',hue='cyl',palette='bright');
2.G. Share insights for Q2.f
2.H. Check for unexpected values in all the features and datapoints with such values.
# 2.H: 'hp' is stored as strings and uses "?" as its missing-value marker.
df['hp'].sample(25)
# Rows carrying the "?" placeholder.
df[df['hp']=="?"]
# Replace the marker with NaN, cast to numeric, THEN impute with the median.
# BUG fixed: the original called Series.dropna(inplace=True) — a no-op on the parent
# frame (and would desync column lengths if it worked) — and tried median() while the
# column was still object dtype; chained inplace calls on df['hp'] are also unreliable.
df['hp'] = pd.to_numeric(df['hp'].replace("?", np.nan))
df['hp'] = df['hp'].fillna(df['hp'].median())
df['hp'].iloc[336]   # spot-check a row that previously held "?"
df.shape
df.info()
df.dtypes
# importing the StandardScaler Module
from sklearn.preprocessing import StandardScaler
# Creating an object for the StandardScaler function
X = StandardScaler()
# Standardize the numeric attribute columns (column 0 is the car name — skipped).
scaled_df = X.fit_transform(df.iloc[:,1:9])
scaled_df
3.A. Apply K-Means clustering for 2 to 10 clusters.
# 3.A prep: confirm remaining missing counts, then drop any rows still containing NaNs.
df.isnull().sum()
df.dropna(inplace=True)
df.isnull().sum()
# Mean of every column per cylinder count.
# NOTE(review): on pandas >= 2.0 this raises for the string car-name column —
# may need numeric_only=True; confirm the pandas version in use.
df.groupby(df['cyl']).mean()
# Calculate age of vehicle.
# 'yr' is a two-digit model year; 83 appears to be the reference year — TODO confirm
# against the dataset's year range.
df['age'] = 83-df['yr']
df.head()
# One-hot encode 'origin' (kept mainly to demonstrate categorical handling; business
# knowledge might argue for dropping the variable instead).
origin_dummies = pd.get_dummies(df['origin'], prefix='origin')
# Attach the indicator columns to the main frame.
df = df.join(origin_dummies)
df.head()
# Drop columns superseded by engineered features ('age', the dummies) plus the free-text name.
df_new = df.drop(['yr','origin','car_name'], axis =1)
df_new.head()
# Spread/outlier check before transforming.
sns.boxplot(data=df_new);
# We could see some outliers for mpg, hp and acc.
sns.boxplot(y=df_new['mpg']);
sns.boxplot(y=df_new['hp']);
# Log-transform the right-skewed columns to pull in the high-end outliers.
df_new['hp'] = np.log(df_new['hp'])
df_new['acc'] = np.log(df_new['acc'])
df_new['mpg'] = np.log(df_new['mpg'])
df_new.head()
# Re-check spread after the transform.
sns.boxplot(data=df_new);
from scipy.stats import zscore
df_new.dtypes
# Standardize every int64/float64 column to zero mean, unit variance.
num_cols = df_new.select_dtypes(include=[np.int64, np.float64]).columns
num_cols
df_new[num_cols] = df_new[num_cols].apply(zscore)
df_new.head()
# 3.A: fit K-Means for k = 2..10 and record each model's inertia (within-cluster SSE).
# (Loop-body indentation restored; the unused labels/centroids captures were dropped.)
cluster_range = range(2, 11)
cluster_errors = []
for num_clusters in cluster_range:
    clusters = KMeans(num_clusters, n_init=5)
    clusters.fit(df_new)
    cluster_errors.append(clusters.inertia_)
# One row per k, for the elbow plot below.
clusters_df = pd.DataFrame({"num_clusters": cluster_range, "cluster_errors": cluster_errors})
clusters_df
3.B. Plot a visual and find elbow point
# 3.B: elbow plot — inertia vs. k; the bend marks diminishing returns from more clusters.
from matplotlib import cm
plt.figure(figsize=(12,6))
plt.plot( clusters_df.num_clusters, clusters_df.cluster_errors, marker = "o" )
3.C. On the above visual, highlight which are the possible Elbow points
3.D. Train a K-means clustering model once again on the optimal number of clusters
# 3.D: refit at the chosen elbow, k=4; fixed random_state keeps labels reproducible.
#Set the value of k=4
kmeans = KMeans(n_clusters=4, n_init = 15, random_state=2345)
kmeans.fit(df_new)
centroids = kmeans.cluster_centers_
centroids
#Calculate the centroids for the columns to profile
centroid_df = pd.DataFrame(centroids, columns = list(df_new) )
print(centroid_df)
# Wide-form scatter of centroid coordinates, one series per feature.
sns.scatterplot(centroid_df,legend='full')
3.E. Add a new feature in the DataFrame which will have labels based upon cluster value.
## creating a new dataframe only for labels and converting it into categorical variable
df_labels = pd.DataFrame(kmeans.labels_ , columns = list(['labels']))
df_labels['labels'] = df_labels['labels'].astype('category')
# Joining the label dataframe with the data frame.
# NOTE(review): join is by index — assumes df's row order still matches the kmeans
# input (df_new); verify no reindexing happened in between.
df_labeled = df.join(df_labels)
df_analysis = (df_labeled.groupby(['labels'] , axis=0)).head(4177) # the groupby creates a groupeddataframe that needs
# to be converted back to dataframe.
df_analysis
# Cluster sizes.
df_labeled['labels'].value_counts()
3.F. Plot a visual and color the datapoints based upon clusters
from mpl_toolkits.mplot3d import Axes3D
## 3F: 3D scatter of three (scaled) features, colored by K-Means cluster label.
fig = plt.figure(figsize=(8, 6))
# Fix: direct Axes3D(fig, ...) construction no longer attaches the axes in modern
# Matplotlib — create the 3D axes through add_subplot instead.
ax = fig.add_subplot(projection='3d', elev=20, azim=60)
kmeans.fit(df_new)
labels = kmeans.labels_
# Fix: np.float was removed in NumPy 1.24 — use the builtin float.
ax.scatter(df_new.iloc[:, 0], df_new.iloc[:, 1], df_new.iloc[:, 9],
           c=labels.astype(float), edgecolor='k')
# Fix: ax.w_xaxis / w_yaxis / w_zaxis were removed in Matplotlib 3.8.
ax.xaxis.set_ticklabels([])
ax.yaxis.set_ticklabels([])
ax.zaxis.set_ticklabels([])
# NOTE(review): axis captions say Length/Height/Weight but the plotted columns are
# df_new columns 0, 1 and 9 — confirm the labels match the data.
ax.set_xlabel('Length')
ax.set_ylabel('Height')
ax.set_zlabel('Weight')
ax.set_title('3D plot of KMeans Clustering')
3.G.Pass a new DataPoint and predict which cluster it belongs to
# 3.G: build one new data point as a dict.
# NOTE(review): all values are strings, and 'cyl': '80' looks like a typo for '8' —
# confirm against the source data before using this row for prediction.
df_row={'car_name': 'ford torino',
        'mpg': '17.0',
        'cyl': '80',
        'disp': '303.0',
        'hp': '145.0',
        'wt': '3431',
        'acc': '11.5',
        'yr': '80',
        'origin':'1',
        'age': '13',
        'origin_1': '1',
        'origin_2': '0',
        'origin_3':'0'}
# Fix: DataFrame.append() was removed in pandas 2.0 — concatenate a one-row frame.
df = pd.concat([df, pd.DataFrame([df_row])], ignore_index=True)
df.shape
# NOTE(review): the question asks which cluster the point belongs to; calling
# kmeans.predict on the scaled feature row would complete that step.
sns.scatterplot(data=df_new,legend='auto');
1.A. Read ‘vehicle.csv’ and save as DataFrame.
#For numerical libraries
import numpy as np
#To handle data in the form of rows and columns
import pandas as pd
#importing seaborn for statistical plots
import seaborn as sns
#importing ploting libraries
import matplotlib.pyplot as plt
#styling figures
#styling figures
plt.rc('font',size=14)
sns.set(style='white')
# The second sns.set overrides the first: final style is whitegrid with color codes on.
sns.set(style='whitegrid',color_codes=True)
#To enable plotting graphs in Jupyter notebook
%matplotlib inline
#importing the Encoding library
from sklearn.preprocessing import LabelEncoder
#Build the model with the best hyper parameters
from sklearn.model_selection import cross_val_score
#importing the zscore for scaling
from scipy.stats import zscore
#Importing PCA for dimensionality reduction and visualization
from sklearn.decomposition import PCA
# Import Logistic Regression machine learning library
from sklearn.linear_model import LogisticRegression
# Import Support Vector Classifier machine learning library
from sklearn.svm import SVC
#Import Naive Bayes' machine learning Library
from sklearn.naive_bayes import GaussianNB
#Import Sklearn package's data splitting function which is based on random function
from sklearn.model_selection import train_test_split
#Grid search to tune model parameters for SVC
from sklearn.model_selection import GridSearchCV
# Import the metrics
from sklearn import metrics
# 1.A: load the vehicle dataset.
# NOTE(review): absolute Windows path — breaks on any other machine.
vehicle_df=pd.read_csv('C:\\Users\\HARITHA\\vehicle.csv')
vehicle_df.head(10)
vehicle_df.info()
1.B. Check percentage of missing values and impute with correct approach
# 1.B: missing-value audit.
vehicle_df.isnull().sum()
# BUG fixed: the percentage was divided by len(df) — the *car* DataFrame from the
# previous exercise — instead of len(vehicle_df), skewing every percentage.
percent_missing = vehicle_df.isnull().sum() * 100 / len(vehicle_df)
percent_missing
#class attribute is not an object it is a category
vehicle_df['class']=vehicle_df['class'].astype('category')
#To get the shape
vehicle_df.shape
#To get the number of columns
vehicle_df.columns
#Checking for missing values in the dataset
vehicle_df.isnull().sum()
# Treat blank cells as missing.
# NOTE(review): the original comment said the placeholder is '?' but the code replaces
# ' ' — confirm which marker the file actually uses.
vehicle_df = vehicle_df.replace(' ', np.nan)
# Impute each of the first 17 feature columns with its own median.
# (Loop-body indentation restored — it was lost in the notebook export.)
for i in vehicle_df.columns[:17]:
    median_value = vehicle_df[i].median()
    vehicle_df[i] = vehicle_df[i].fillna(median_value)
# again check for missing values
vehicle_df.isnull().sum()
# Again check data information
vehicle_df.info()
# Understand the spread and outliers in dataset using boxplot
vehicle_df.boxplot(figsize=(35,15));
# Histogram of every column.
vehicle_df.hist(figsize=(15,15));
# Replace outliers by the column median: anything outside [Q1-1.5*IQR, Q3+1.5*IQR].
# (Loop-body indentation restored — it was lost in the notebook export.)
for col_name in vehicle_df.columns[:-1]:
    q1 = vehicle_df[col_name].quantile(0.25)
    q3 = vehicle_df[col_name].quantile(0.75)
    iqr = q3 - q1
    low = q1-1.5*iqr
    high = q3+1.5*iqr
    vehicle_df.loc[(vehicle_df[col_name] < low) | (vehicle_df[col_name] > high), col_name] = vehicle_df[col_name].median()
# again check for outliers in dataset using boxplot
vehicle_df.boxplot(figsize=(35,15));
# Inspect the target classes and their balance.
print('Class: \n', vehicle_df['class'].unique())
vehicle_df['class'].value_counts()
#Encoding of categorical variables
labelencoder_X=LabelEncoder()
# Map the class labels to integer codes.
vehicle_df['class']=labelencoder_X.fit_transform(vehicle_df['class'])
#correlation matrix
cor=vehicle_df.corr()
cor
# correlation plot---heatmap
sns.set(font_scale=1.15)
fig,ax=plt.subplots(figsize=(18,15))
# NOTE(review): vmin=0.8 combined with center=0 clips everything below 0.8 into one
# color band — confirm that is intended.
sns.heatmap(cor,vmin=0.8, annot=True,linewidths=0.01,center=0,linecolor="white",cbar=False,square=True)
plt.title('Correlation between attributes',fontsize=18)
ax.tick_params(labelsize=18)
#pair panel
sns.pairplot(vehicle_df,hue='class');
1.C. Visualize a Pie-chart and print percentage of values for variable ‘class’
# 1.C: class distribution as counts, percentages, and a pie chart.
vehicle_df['class'].value_counts()
per_class=vehicle_df['class'].value_counts() * 100 / len(vehicle_df['class'])
per_class
# autopct prints each wedge's share to one decimal place.
plt.pie(per_class, autopct='%1.1f%%');
1.D. Check for duplicate rows in the data and impute with correct approach.
vehicle_df.duplicated().sum()
2.A. Split data into X and Y. [Train and Test optional]
#independent and dependent variables
X=vehicle_df.iloc[:,0:18]
y = vehicle_df.iloc[:,18]
# Split X and y into training and test set in 70:30 ratio
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3, random_state = 10)
# Baseline model 1: logistic regression on the raw features.
model = LogisticRegression()
model.fit(X_train, y_train)
prediction = model.predict(X_test)
# check the accuracy on the training data
print('Accuracy on Training data: ',model.score(X_train, y_train))
# check the accuracy on the testing data
print('Accuracy on Testing data: ',model.score(X_test , y_test))
#Calculate the recall value
print('Recall value: ',metrics.recall_score(y_test, prediction, average='macro'))
#Calculate the precision value
print('Precision value: ',metrics.precision_score(y_test, prediction, average='macro'))
# BUG fixed: sklearn's signature is (y_true, y_pred); the reversed arguments transpose
# the confusion matrix and swap precision/recall in the classification report.
print("Confusion Matrix:\n",metrics.confusion_matrix(y_test, prediction))
print("Classification Report:\n",metrics.classification_report(y_test, prediction))
# Start the comparison table with this model's test accuracy.
resultsDf=pd.DataFrame({'Model':['Logistic'],'Accuracy': model.score(X_test , y_test)},index={'1'})
resultsDf=resultsDf[['Model','Accuracy']]
resultsDf
from sklearn.naive_bayes import GaussianNB
# Baseline model 2: Gaussian Naive Bayes on the raw features.
model = GaussianNB()
model.fit(X_train, y_train)
prediction = model.predict(X_test)
# check the accuracy on the training data
print('Accuracy on Training data: ',model.score(X_train, y_train))
# check the accuracy on the testing data
print('Accuracy on Testing data: ', model.score(X_test , y_test))
#Calculate the recall value
print('Recall value: ',metrics.recall_score(y_test, prediction, average='macro'))
#Calculate the precision value
print('Precision value: ',metrics.precision_score(y_test, prediction, average='macro'))
# BUG fixed: sklearn expects (y_true, y_pred) — the arguments were reversed.
print("Confusion Matrix:\n",metrics.confusion_matrix(y_test, prediction))
print("Classification Report:\n",metrics.classification_report(y_test, prediction))
#Store the accuracy results for each kernel in a dataframe for final comparison
tempResultsDf = pd.DataFrame({'Model':['Naive Bayes'], 'Accuracy': model.score(X_test, y_test)},index={'2'})
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Model','Accuracy']]
resultsDf
# SVM with default RBF kernel on the raw features.
clf = SVC()
clf.fit(X_train, y_train)
# BUG fixed: the original called model.predict/model.score here, i.e. it re-reported the
# previous (Naive Bayes) model's metrics under the 'SVM' heading. Use clf throughout.
prediction = clf.predict(X_test)
# 3.B. Print Classification metrics for train data.
# check the accuracy on the training data
print('Accuracy on Training data: ', clf.score(X_train, y_train))
# check the accuracy on the testing data
print('Accuracy on Testing data: ', clf.score(X_test , y_test))
#Calculate the recall value
print('Recall value: ',metrics.recall_score(y_test, prediction, average='macro'))
#Calculate the precision value
print('Precision value: ',metrics.precision_score(y_test, prediction, average='macro'))
# BUG fixed: sklearn expects (y_true, y_pred) — the arguments were reversed.
print("Confusion Matrix:\n",metrics.confusion_matrix(y_test, prediction))
print("Classification Report:\n",metrics.classification_report(y_test, prediction))
#Store the accuracy results for each kernel in a dataframe for final comparison
tempResultsDf = pd.DataFrame({'Model':['SVM'], 'Accuracy': clf.score(X_test, y_test)},index={'3'})
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Model','Accuracy']]
resultsDf
3.C. Apply PCA on the data with 10 components.
# Scaling the independent attributes using zscore
X_z=X.apply(zscore)
# Line plots of every column, prior to scaling.
plt.rcParams['figure.figsize']=(10,6)
plt.plot(vehicle_df)
plt.show()
# Same view after scaling.
plt.rcParams['figure.figsize']=(10,6)
plt.plot(X_z)
plt.show()
# Calculating the covariance between attributes after scaling
cov_matrix = np.cov(X_z.T)
# Fix: the '%s' placeholder was never interpolated — the matrix was passed as a
# second print() argument, so the literal '%s' showed up in the output.
print('Covariance Matrix \n%s' % cov_matrix)
#Finding eigenvalues and eigenvectors of the covariance matrix
eigenvalues, eigenvectors = np.linalg.eig(cov_matrix)
# Fix: interpolate with %, rather than passing the array as a second print() argument.
print('Eigen Vectors \n%s' % eigenvectors)
print('\n Eigen Values \n%s' % eigenvalues)
# Make a set of (eigenvalue, eigenvector) pairs
eigen_pairs = [(np.abs(eigenvalues[i]), eigenvectors[:,i]) for i in range(len(eigenvalues))]
# Fix: sort on the eigenvalue only. Plain tuple sorting falls through to comparing the
# ndarray second elements on tied eigenvalues and raises
# "truth value of an array is ambiguous".
eigen_pairs.sort(key=lambda pair: pair[0], reverse=True)
eigen_pairs[:]
# print out eigenvalues
# NOTE(review): np.linalg.eig does not sort — this prints them in raw order despite the label.
print('Eigenvalues in descending order: \n%s' %eigenvalues)
# Percentage of variance explained by each component, plus the cumulative curve.
tot = sum(eigenvalues)
var_exp = [( i /tot ) * 100 for i in sorted(eigenvalues, reverse=True)]
cum_var_exp = np.cumsum(var_exp)
print("Cumulative Variance Explained", cum_var_exp)
plt.plot(var_exp)
# Scree plot: per-component explained variance (bars) and the cumulative curve (steps).
plt.figure(figsize=(8 , 7))
plt.bar(range(1, eigenvalues.size + 1), var_exp, alpha = 0.5, align = 'center', label = 'Individual explained variance')
plt.step(range(1, eigenvalues.size + 1), cum_var_exp, where='mid', label = 'Cumulative explained variance')
plt.ylabel('Explained Variance Ratio')
plt.xlabel('Principal Components')
plt.legend(loc = 'best')
plt.tight_layout()
plt.show()
# 3.C: reduce to a 10-dimensional space with sklearn's PCA.
# NOTE(review): the comment said "from 17" but X has 18 columns (iloc[:,0:18]) — confirm.
pca = PCA(n_components=10)
data_reduced = pca.fit_transform(X_z)
data_reduced.transpose()
pca.components_
# Loadings: one row per principal component, one column per original feature.
X_comp = pd.DataFrame(pca.components_,columns=list(X_z))
X_comp.head()
# P_reduce represents the reduced mathematical space (top-10 eigenvectors).
# BUG fixed: np.linalg.eig returns eigenvectors as *columns* (v[:, i]); the original
# sliced eigenvectors[0:10], taking the first 10 *rows* — fragments of every
# eigenvector — instead of 10 whole eigenvectors. Take the first 10 columns and
# transpose to shape (10, n_features).
# NOTE(review): eig does not sort eigenvalues, so "first 10" is raw order; ideally pick
# the 10 largest (cf. the sorted eigen_pairs above).
P_reduce = eigenvectors[:, 0:10].T
# projecting original data into principal component dimensions
X_std_10D = np.dot(X_z,P_reduce.T)
# converting array to dataframe for pairplot
Proj_data_df = pd.DataFrame(X_std_10D)
#Let us check it visually
sns.pairplot(Proj_data_df, diag_kind='kde');
# Split the PCA-projected data into training and test set in 70:30 ratio
X_train, X_test, y_train, y_test = train_test_split(Proj_data_df,y, test_size = 0.3, random_state = 10)
# Logistic regression on the reduced features.
model = LogisticRegression()
model.fit(X_train, y_train)
prediction = model.predict(X_test)
# check the accuracy on the training data
print('Accuracy on Training data: ',model.score(X_train, y_train))
# check the accuracy on the testing data
print('Accuracy on Testing data: ',model.score(X_test , y_test))
#Calculate the recall value
print('Recall value: ',metrics.recall_score(y_test, prediction, average='macro'))
#Calculate the precision value
print('Precision value: ',metrics.precision_score(y_test, prediction, average='macro'))
# BUG fixed: sklearn expects (y_true, y_pred) — the arguments were reversed.
print("Confusion Matrix:\n",metrics.confusion_matrix(y_test, prediction))
print("Classification Report:\n",metrics.classification_report(y_test, prediction))
# Restart the comparison table for the post-PCA models.
resultsDf=pd.DataFrame({'Model':['Logistic'],'Accuracy': model.score(X_test , y_test)},index={'1'})
resultsDf=resultsDf[['Model','Accuracy']]
resultsDf
from sklearn.naive_bayes import GaussianNB
# Gaussian Naive Bayes on the PCA-projected features.
model = GaussianNB()
model.fit(X_train, y_train)
prediction = model.predict(X_test)
# check the accuracy on the training data
print('Accuracy on Training data: ',model.score(X_train, y_train))
# check the accuracy on the testing data
print('Accuracy on Testing data: ', model.score(X_test , y_test))
#Calculate the recall value
print('Recall value: ',metrics.recall_score(y_test, prediction, average='macro'))
#Calculate the precision value
print('Precision value: ',metrics.precision_score(y_test, prediction, average='macro'))
# BUG fixed: sklearn expects (y_true, y_pred) — the arguments were reversed.
print("Confusion Matrix:\n",metrics.confusion_matrix(y_test, prediction))
print("Classification Report:\n",metrics.classification_report(y_test, prediction))
#Store the accuracy results for each kernel in a dataframe for final comparison
tempResultsDf = pd.DataFrame({'Model':['Naive Bayes'], 'Accuracy': model.score(X_test, y_test)},index={'2'})
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Model','Accuracy']]
resultsDf
#Use the Naive Bayes Classifier with 10-fold cross validation on the PCA-projected data.
# `model` here is the GaussianNB fitted in the previous cell.
scores = cross_val_score(model, Proj_data_df, y, cv=10)
print(scores)
print('Average score: ', np.mean(scores))
#Store the accuracy results for each kernel in a dataframe for final comparison
tempResultsDf = pd.DataFrame({'Model':['Naive Bayes k fold'], 'Accuracy': np.mean(scores)},index={'3'})
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Model','Accuracy']]
resultsDf
# SVM on the PCA-projected features.
clf = SVC()
clf.fit(X_train, y_train)
# BUG fixed: model.predict/model.score reported the previous Naive Bayes model's
# metrics under the 'SVM' heading — use the fitted clf throughout.
prediction = clf.predict(X_test)
# check the accuracy on the training data
print('Accuracy on Training data: ', clf.score(X_train, y_train))
# check the accuracy on the testing data
print('Accuracy on Testing data: ', clf.score(X_test , y_test))
#Calculate the recall value
print('Recall value: ',metrics.recall_score(y_test, prediction, average='macro'))
#Calculate the precision value
print('Precision value: ',metrics.precision_score(y_test, prediction, average='macro'))
# BUG fixed: sklearn expects (y_true, y_pred) — the arguments were reversed.
print("Confusion Matrix:\n",metrics.confusion_matrix(y_test, prediction))
print("Classification Report:\n",metrics.classification_report(y_test, prediction))
#Store the accuracy results for each kernel in a dataframe for final comparison
tempResultsDf = pd.DataFrame({'Model':['SVM'], 'Accuracy': clf.score(X_test, y_test)},index={'4'})
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Model','Accuracy']]
resultsDf
#Grid search to tune model parameters for SVC
from sklearn.model_selection import GridSearchCV
model = SVC()
# Search over regularization strength C and kernel type; verbose=5 prints per-fit progress.
params = {'C': [0.01, 0.1, 0.5, 1], 'kernel': ['linear', 'rbf']}
model1 = GridSearchCV(model, param_grid=params, verbose=5)
model1.fit(X_train, y_train)
print("Best Hyper Parameters:\n", model1.best_params_)
print(" Results from Grid Search " )
print("\n The best estimator across ALL searched params:\n",model1.best_estimator_)
print("\n The best score across ALL searched params:\n",model1.best_score_)
print("\n The best parameters across ALL searched params:\n",model1.best_params_)
B. Share best Parameters observed from above step.
The best estimator across ALL searched params: SVC(C=0.1, kernel='linear')
The best score across ALL searched params: 0.9375587523144852
The best parameters across ALL searched params: {'C': 0.1, 'kernel': 'linear'}
#Build the model with the best hyper parameters
# Fix: the grid search above reported {'C': 0.1, 'kernel': 'linear'} as best, but
# C=0.5 was hard-coded here — use the winning value.
model = SVC(C=0.1, kernel="linear")
# 10-fold cross-validated accuracy on the PCA-projected data.
scores = cross_val_score(model, Proj_data_df, y, cv=10)
print(scores)
print(np.mean(scores))
#Store the accuracy results for each kernel in a dataframe for final comparison
tempResultsDf = pd.DataFrame({'Model':['SVM k fold'], 'Accuracy': np.mean(scores)},index={'5'})
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Model','Accuracy']]
resultsDf
#splitting the data in test and train sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3, random_state = 10)
# scaling the data using the standard scaler
from sklearn.preprocessing import StandardScaler
# BUG fixed: the test set was scaled with its own freshly-fitted scaler, which uses
# test-set statistics (leakage) and puts train and test on slightly different scales.
# Fit on the training data only, then apply that same transform to both sets.
scaler = StandardScaler().fit(X_train)
X_train_sd = scaler.transform(X_train)
X_test_sd = scaler.transform(X_test)
# generating the covariance matrix and the eigen values for the PCA analysis
cov_matrix = np.cov(X_train_sd.T) # the relevant covariance matrix
# Fix: interpolate with %, rather than passing the matrix as a second print() argument
# (the literal '%s' showed up in the output).
print('Covariance Matrix \n%s' % cov_matrix)
#generating the eigen values and the eigen vectors
e_vals, e_vecs = np.linalg.eig(cov_matrix)
print('Eigenvectors \n%s' %e_vecs)
print('\nEigenvalues \n%s' %e_vals)
# the "cumulative variance explained" analysis
tot = sum(e_vals)
var_exp = [( i /tot ) * 100 for i in sorted(e_vals, reverse=True)]
cum_var_exp = np.cumsum(var_exp)
print("Cumulative Variance Explained", cum_var_exp)
# Plotting the variance explained by the principal components and the cumulative variance explained.
plt.figure(figsize=(10 , 5))
plt.bar(range(1, e_vals.size + 1), var_exp, alpha = 0.5, align = 'center', label = 'Individual explained variance')
plt.step(range(1, e_vals.size + 1), cum_var_exp, where='mid', label = 'Cumulative explained variance')
plt.ylabel('Explained Variance Ratio')
plt.xlabel('Principal Components')
plt.legend(loc = 'best')
plt.tight_layout()
plt.show()
# Pair each |eigenvalue| with its eigenvector (columns of e_vecs).
eigen_pairs = [(np.abs(e_vals[i]), e_vecs[:,i]) for i in range(len(e_vals))]
# Fix: sort on the eigenvalue only — plain tuple sorting falls through to the ndarray
# second element on tied eigenvalues and raises "truth value of an array is ambiguous".
eigen_pairs.sort(key=lambda pair: pair[0], reverse=True)
eigen_pairs[:5]
# generating dimensionally reduced datasets
# Projection matrix from the top-2 eigenvectors (n_features x 2).
w = np.hstack((eigen_pairs[0][1].reshape(-1,1), eigen_pairs[1][1].reshape(-1,1)))
print('Matrix W:\n', w)
# Project train and test sets into the 2-D principal subspace.
X_sd_pca = X_train_sd.dot(w)
X_test_sd_pca = X_test_sd.dot(w)
X_train_sd.shape, w.shape, X_sd_pca.shape, X_test_sd_pca.shape
from sklearn.svm import SVC
# Compare SVC accuracy on the full standardized features vs. the 2-D projection.
clf = SVC()
clf.fit(X_train_sd, y_train)
print ('Before PCA score', clf.score(X_test_sd, y_test))
clf.fit(X_sd_pca, y_train)
print ('After PCA score', clf.score(X_test_sd_pca, y_test))
from sklearn.linear_model import LogisticRegression
# Same before/after-PCA comparison for logistic regression.
model = LogisticRegression()
model.fit(X_train_sd, y_train)
print ('Before PCA score', model.score(X_test_sd, y_test))
model.fit(X_sd_pca, y_train)
print ('After PCA score', model.score(X_test_sd_pca, y_test))
from sklearn.naive_bayes import GaussianNB
# Same before/after-PCA comparison for Gaussian Naive Bayes.
model = GaussianNB()
model.fit(X_train_sd, y_train)
print ('Before PCA score', model.score(X_test_sd, y_test))
model.fit(X_sd_pca, y_train)
print ('After PCA score', model.score(X_test_sd_pca, y_test))
5.A. Explain pre-requisite/assumptions of PCA.
5.B. Explain advantages and limitations of PCA.
Advantages of PCA:
Limitations of PCA: